Author: Lily Cheng. Oct. 11th, 2020

STEP 1. Read the downloaded data.

In [1]:
import s3fs
import pandas as pd
In [2]:
# csv file
train_df = pd.read_json('s3://sagemaker-studio-528576943967-ssf9zkrg3os/train.json')
In [3]:
len(train_df)
Out[3]:
39774
In [4]:
train_df
Out[4]:
id cuisine ingredients
0 10259 greek [romaine lettuce, black olives, grape tomatoes...
1 25693 southern_us [plain flour, ground pepper, salt, tomatoes, g...
2 20130 filipino [eggs, pepper, salt, mayonaise, cooking oil, g...
3 22213 indian [water, vegetable oil, wheat, salt]
4 13162 indian [black pepper, shallots, cornflour, cayenne pe...
... ... ... ...
39769 29109 irish [light brown sugar, granulated sugar, butter, ...
39770 11462 italian [KRAFT Zesty Italian Dressing, purple onion, b...
39771 2238 irish [eggs, citrus fruit, raisins, sourdough starte...
39772 41882 chinese [boneless chicken skinless thigh, minced garli...
39773 2362 mexican [green chile, jalapeno chilies, onions, ground...

39774 rows × 3 columns

Step 2. Re-organize Dataframe

In [5]:
# https://stackoverflow.com/questions/50217968/pandas-split-list-in-column-into-multiple-rows
new_df = pd.DataFrame([
    [p, t, i] for p, t, I in train_df.values
    for i in I
], columns=train_df.columns)
In [6]:
new_new_df = new_df.drop(columns=['cuisine'])
In [7]:
new_new_df
Out[7]:
id ingredients
0 10259 romaine lettuce
1 10259 black olives
2 10259 grape tomatoes
3 10259 garlic
4 10259 pepper
... ... ...
428270 2362 garlic
428271 2362 white sugar
428272 2362 roma tomatoes
428273 2362 celery
428274 2362 dried oregano

428275 rows × 2 columns

Step 3. Data Exploration

In [87]:
# conda install -c conda-forge cufflinks-py
In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")
plt.style.use('ggplot')

import cufflinks as cf
import plotly.express as px
import plotly.offline as py
from plotly.offline import plot
import plotly.graph_objects as go
import plotly.graph_objs as go

# try:
#     import apyori
# except:
#     !pip install apyori

# from apyori import apriori
In [11]:
#Graph : Item by count
fig = px.bar(new_new_df["ingredients"].value_counts()[:30], orientation="v", color=new_new_df["ingredients"].value_counts()[:30], color_continuous_scale=px.colors.sequential.Plasma, 
             log_x=False, labels={'value':'Count', 
                                'index':'Item',
                                 'color':'None'
                                })

fig.update_layout(
    font_color="black",
    title_font_color="red",
    legend_title_font_color="green",
    title_text="Item by count"
)

fig.show()

There are some redundancy in the table, such as "egg" and "big eggs", "onion" and "purple onions", "pepper" and "grounded black pepper", and so on

In [12]:
df_no_redundancy = new_new_df.copy()
In [13]:
df_no_redundancy['ingredients'] = new_new_df['ingredients'].replace(['large eggs','eggs','purple onion','onions','ground black pepper','black pepper','olive oil','vegetable oil','sesame oil', 'garlic cloves', 'extra-virgin olive oil', 'minced garlic', 'unsalted butter', 'kosher salt', 'diced tomatoes', 'fresh ginger', 'brown sugar', 'garlic powder','fresh lemon juice','fresh lime juice', 'fresh parsley', 'cooking oil', 'canola oil', 'all-purpose flour', 'chopped onion', 'grated parmesan cheese', 'chopped cilantro fresh','ground cinnamon', 'fresh lime juice','boneless skinless chicken breasts', 'yellow onion', 'flat leaf parsley', 'lime juice','cilantro leaves','heavy cream', 'fresh basil','white sugar', 'jalapeno chilies'], 
                                                                    ['egg',       'egg', 'onion',       'onion',               'pepper',     'pepper',      'oil',           'oil',      'oil',  'garlic',        'oil',                    'garlic',        'butter',          'salt',        'tomatoes',       'ginger',       'sugar',       'garlic',       'lemon juice',      'fresh lime juice',  'parsley',      'oil',         'oil',        'flour',            'onion',          'parmesan cheese',             'cilantro',        'cinnamon',         'lime',            'chicken breasts',                'oninon',          'parsley',           'lime',      'cilantro',       'cream',      'basil',       'sugar',       'jalapeno'])

salt, waterm and sugar are not really "food items" and should be ignored

In [14]:
df_no_season = df_no_redundancy[~df_no_redundancy.ingredients.isin(['salt','sugar','water','cooking spray','baking powder'])]
In [15]:
#Graph : Item by count
fig = px.bar(df_no_season["ingredients"].value_counts()[:30], orientation="v", color=df_no_season["ingredients"].value_counts()[:30], color_continuous_scale=px.colors.sequential.Plasma, 
             log_x=False, labels={'value':'Count', 
                                'index':'Item',
                                 'color':'None'
                                })

fig.update_layout(
    font_color="black",
    title_font_color="red",
    legend_title_font_color="green",
    title_text="Item by count"
)

fig.show()

Feature Engineering

In [19]:
# !pip install mlxtend  
In [20]:
# !pip install squarify
In [21]:
from mlxtend.preprocessing import TransactionEncoder
import matplotlib
import squarify
In [22]:
basket = df_no_season
basket.head()
Out[22]:
id ingredients
0 10259 romaine lettuce
1 10259 black olives
2 10259 grape tomatoes
3 10259 garlic
4 10259 pepper
In [23]:
transactions = [a[1]['ingredients'].tolist() for a in list(basket.groupby(['id']))]
te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)
transactions = pd.DataFrame(te_ary, columns=te.columns_)
pf = transactions.describe()
f = pf.iloc[0]-pf.iloc[3]
a = f.tolist()
b = list(f.index)
item = pd.DataFrame([[a[r],b[r]]for r in range(len(a))], columns=['Count','Item'])
item = item.sort_values(['Count'], ascending=False).head(50)
transactions
Out[23]:
( oz.) tomato sauce ( oz.) tomato paste (10 oz.) frozen chopped spinach (10 oz.) frozen chopped spinach, thawed and squeezed dry (14 oz.) sweetened condensed milk (14.5 oz.) diced tomatoes (15 oz.) refried beans 1% low-fat buttermilk 1% low-fat chocolate milk 1% low-fat cottage cheese ... yukon gold potatoes yuzu yuzu juice za'atar zest zesty italian dressing zinfandel ziti zucchini zucchini blossoms
0 False False False False False False False False False False ... False False False False False False False False False False
1 False False False False False False False False False False ... False False False False False False False False False False
2 False False False False False False False False False False ... False False False False False False False False False False
3 False False False False False False False False False False ... False False False False False False False False False False
4 False False False False False False False False False False ... False False False False False False False False False False
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
39768 False False False False False False False False False False ... False False False False False False False False False False
39769 False False False False False False False False False False ... False False False False False False False False False False
39770 False False False False False False False False False False ... False False False False False False False False False False
39771 False False False False False False False False False False ... False False False False False False False False False False
39772 False False False False False False False False False False ... False False False False False False False False False False

39773 rows × 6676 columns

In [24]:
fig, ax = plt.subplots(figsize=(18,10))
cmap = matplotlib.cm.coolwarm

mini = min(item["Count"])
maxi = max(item["Count"])

norm = matplotlib.colors.Normalize(vmin=mini, vmax=maxi)
colors = [cmap(norm(value)) for value in item["Count"]]

squarify.plot(sizes=item["Count"], label=item["Item"], alpha=0.8, color=colors)
plt.axis('off')
plt.title("Top 50 Frequent Recipe Items", fontsize=32)
ttl = ax.title
ttl.set_position([.5, 1.05])

Apriori Model

In [25]:
from mlxtend.frequent_patterns import apriori, association_rules 
In [31]:
frequent_itemset = apriori(transactions, min_support=0.009, use_colnames=True, max_len=5)
In [32]:
#frequent_itemsets = apriori(transactions, min_support=0.015, use_colnames=True, max_len=5)
frequent_itemsets = frequent_itemset
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(lambda x: len(x))
frequent_itemsets # This table gives all association rules for basket analysis
Out[32]:
support itemsets length
0 0.009479 (active dry yeast) 1
1 0.030875 (avocado) 1
2 0.015588 (bacon) 1
3 0.023684 (baking soda) 1
4 0.014432 (balsamic vinegar) 1
... ... ... ...
809 0.009127 (oil, ground cumin, pepper, onion) 4
810 0.012722 (oil, pepper, onion, parsley) 4
811 0.017399 (tomatoes, oil, pepper, onion) 4
812 0.009353 (onion, garlic, pepper, oil, parsley) 5
813 0.013074 (onion, garlic, pepper, oil, tomatoes) 5

814 rows × 3 columns

In [33]:
b = association_rules(frequent_itemsets, metric="lift", min_threshold=0.001)
b['uni'] = np.nan
b['ant'] = np.nan
b['con'] = np.nan
b['tot'] = 39773
In [34]:
transactions = [a[1]['ingredients'].tolist() for a in list(basket.groupby(['id']))]

def trans():
    for t in transactions:
        yield t
    
def ant(x):
    cnt = 0
    for t in trans():
        t = set(t)
        if x.intersection(t) == x:
            cnt = cnt + 1 
    return cnt

bb = b.values.tolist()  
In [36]:
rules_dict = []
for bbb in bb:
    bbb[10] = ant(bbb[0])
    bbb[11] = ant(bbb[1])
    bbb[9] = ant(bbb[0].union(bbb[1]))
    diction = {
        'lhs': tuple(bbb[0]), 
        'rhs': tuple(bbb[1]),
        'count_full': bbb[9],
        'count_lhs': bbb[10],
        'count_rhs': bbb[11],
        'num_transactions': bbb[12]
    }
    rules_dict.append(diction)
    

Visualization

In [88]:
# pip install --index-url https://test.pypi.org/simple/ PyARMViz
In [81]:
from PyARMViz import PyARMViz
In [82]:
from PyARMViz.Rule import generate_rule_from_dict
In [83]:
rules = []
for rd in rules_dict: 
    rules.append(generate_rule_from_dict(rd))

Parallel Axis Plot

In [84]:
PyARMViz.generate_parallel_category_plot(rules)

Network plot

In [85]:
PyARMViz.generate_rule_graph_plotly(rules)
In [86]:
PyARMViz.generate_rule_strength_plot(rules)
In [ ]: